iT邦幫忙

2023 iThome 鐵人賽

DAY 27
0
Modern Web

職缺資訊平台—Jobscanner系列 第 27

[開發] 透過 Cloud Functions 將職缺資料寫入 Firestore

  • 分享至 

  • xImage
  •  

檔案拆分

將檔案做拆分方便維護,例如:

├── constants
│   └── index.js
├── utils
│   ├── file.js
│   └── jobFormat.js
├── index.js
├── 104.js
├── cakeresume.js
├── yourator.js
└── package.json

constants/index.js 放固定的常數

// Base URL used for each job board; `one04` is the entry for 104.com.tw.
const BASE_URL = {
  one04: "https://www.104.com.tw/jobs/search/",
  yourator: "https://www.yourator.co/api/v4/jobs",
  cakeresume: "https://www.cakeresume.com/jobs",
};

// Search keywords; index.js runs every job-board fetcher once per keyword.
const KEYWORD_LIST = ["前端", "Front-end"];

module.exports = { BASE_URL, KEYWORD_LIST };

jobFormat.js 用來放整理資料結構或內容的函式

// utils/jobFormat.js
// Extension-less path: Node resolves it to either `constants.js` or
// `constants/index.js`, matching how index.js requires "./constants".
const { BASE_URL } = require("../constants");

/**
 * Normalizes the free-text `salary` of every job into a structured form.
 *
 * For each job the salary text is:
 *   1. classified as "month" (contains 月), "year" (contains 年) or "other";
 *   2. split on `-` or `~` into its parts (typically lower / upper bound);
 *   3. each part converted to a number when one can be extracted:
 *      - "3.5萬"  -> 35000 (number before 萬 multiplied by 10,000)
 *      - "40,000" -> 40000 (thousand separators removed)
 *      - otherwise the original text fragment is kept as-is.
 *
 * @param {Array<Object>} jobList - Jobs whose `salary` is a display string.
 * @returns {Array<Object>} New job objects with `salary` replaced by an array
 *   of numbers and/or untouched string fragments, plus a `salaryType` field.
 */
const convertSalaryFormat = (jobList) => {
  const keyword = {
    year: "年",
    month: "月",
    tenThousand: "萬",
  };

  // Number with optional decimals, directly (or after whitespace) before 萬.
  // Built once, outside the loops. The dot is escaped twice so it survives the
  // template literal and matches a literal "." instead of any character.
  const tenThousandRegex = new RegExp(
    `\\d+(?:\\.\\d+)?(?=\\s*${keyword.tenThousand})`
  );
  // Digits optionally grouped with thousand separators, e.g. "40,000".
  const plainNumberRegex = /(\d+,?)+/;

  // Converts one fragment of the salary text to a number when possible.
  const parseAmount = (item) => {
    const tenThousandMatch = item.match(tenThousandRegex);
    if (tenThousandMatch) {
      // Round to avoid floating point residue such as 43499.99999999999.
      return Math.round(Number(tenThousandMatch[0]) * 10000);
    }

    const plainMatch = item.match(plainNumberRegex);
    if (plainMatch) {
      return Number(plainMatch[0].replaceAll(",", ""));
    }

    // No number found (e.g. "待遇面議"): keep the original text.
    return item;
  };

  return jobList.map((job) => {
    // A missing or non-string salary must not abort the whole conversion.
    const rawSalary =
      typeof job.salary === "string" ? job.salary : String(job.salary ?? "");

    // Monthly pay is checked first, then yearly, anything else is "other".
    let salaryType = "other";
    if (rawSalary.includes(keyword.month)) {
      salaryType = "month";
    } else if (rawSalary.includes(keyword.year)) {
      salaryType = "year";
    }

    // Split on - or ~ so a range becomes one entry per bound.
    const salary = rawSalary.split(/-|~/).map(parseAmount);

    return {
      ...job,
      salary,
      salaryType,
    };
  });
};

module.exports = { convertJobListFromYourator, convertSalaryFormat };

建立 Functions

流程:由 Cloud Scheduler 呼叫 Cloud Functions,Functions 抓取資料並整理後存至 Firestore。

Cloud Functions

index.js 內容,Entry Point: init

// Cloud Functions
const functions = require('@google-cloud/functions-framework');

const { KEYWORD_LIST } = require("./constants");
const { convertSalaryFormat } = require("./utils/jobFormat.js");
const fetch104Job = require("./104.js");
const fetchYouratorJob = require("./yourator.js");
const fetchCakeresumeJob = require("./cakeresume.js");

// Firestore
const { initializeApp, cert } = require('firebase-admin/app');
const { getFirestore, FieldValue } = require('firebase-admin/firestore');

// Load the service-account key file that sits next to this module and use it
// as the credential for the Firebase Admin SDK.
// NOTE(review): this file holds secrets — make sure it is never committed.
const serviceAccount = require('./serviceAccountKey.json');
initializeApp({
     credential: cert(serviceAccount)
});

// Firestore client; the `init` handler writes job documents through it.
const db = getFirestore();

// Page range for scraping: `startPage` is handed to every getAll*Job call,
// `endPage` is the last page getAll104Job is allowed to continue to.
let startPage = 1;
let endPage = 10;

/**
 * HTTP entry point `init`.
 *
 * Fetches jobs from every job board, normalizes their salary, derives a
 * document key from each job URL and upserts every job into the `jobList`
 * Firestore collection. Responds 200 with the number of fetched jobs once all
 * writes have completed, or 500 if any step fails.
 */
functions.http('init', async (req, res) => {
  try {
    // 1. Fetch the raw job data.
    const result = await fetchData();

    // 2. Normalize the salary format.
    // 3. Strip non-word characters from the URL and use the rest as the key.
    const data = convertSalaryFormat(result).map((item) => ({
      ...item,
      key: item.url.replace(/\W/g, ''),
    }));

    // 4. Write every job using job.key as the document ID. The writes are
    //    awaited before responding: work still pending after the response is
    //    sent is not guaranteed to finish, and `forEach` ignores the promises
    //    returned by an async callback, so failures would go unnoticed.
    const jobCollection = db.collection('jobList');
    await Promise.all(
      data.map((job) =>
        jobCollection
          .doc(job.key)
          .set({ ...job, timestamp: FieldValue.serverTimestamp() })
      )
    );

    res.status(200).send(`Total is ${result.length}`);
  } catch (err) {
    // Surface the failure to the caller (e.g. Cloud Scheduler) and the logs.
    console.error('init failed:', err);
    res.status(500).send('Failed to update job list');
  }
});

/**
 * Collects 104 jobs for one keyword, page by page.
 *
 * Always requests `startPage`, then keeps requesting the following pages
 * until a page comes back empty or the next page would exceed `endPage`.
 *
 * @param {string} keyword - Search keyword forwarded to fetch104Job.
 * @param {number} startPage - First page to request.
 * @returns {Promise<Array>} Jobs of all requested pages, in page order.
 */
const getAll104Job = async (keyword, startPage) => {
  const collected = [];
  let currentPage = startPage;

  while (true) {
    const pageJobs = await fetch104Job(keyword, currentPage);
    for (const job of pageJobs) {
      collected.push(job);
    }
    currentPage += 1;

    // Stop on an empty page or once the page limit has been passed.
    if (!pageJobs.length || currentPage > endPage) {
      break;
    }
  }

  return collected;
};

// Presumably the Yourator counterpart of getAll104Job — the body is omitted
// in the article. NOTE(review): fetchData spreads the resolved value, so the
// real implementation has to resolve to an array.
const getAllYouratorJob = async (keyword, startPage) => {
  // omitted
};

// Presumably the CakeResume counterpart of getAll104Job — the body is omitted
// in the article. NOTE(review): fetchData spreads the resolved value, so the
// real implementation has to resolve to an array.
const getAllCakeresumeJob = async (keyword, startPage) => {
  // omitted
};

/**
 * Runs getAll104Job, getAllYouratorJob and getAllCakeresumeJob for every
 * keyword in KEYWORD_LIST, one call at a time, and concatenates the results.
 *
 * @returns {Promise<Array>} All jobs, grouped by keyword and, within a
 *   keyword, ordered 104 -> Yourator -> CakeResume.
 */
const fetchData = async () => {
  const allJobs = [];

  for (const keyword of KEYWORD_LIST) {
    const jobsFrom104 = await getAll104Job(keyword, startPage);
    const jobsFromYourator = await getAllYouratorJob(keyword, startPage);
    const jobsFromCakeresume = await getAllCakeresumeJob(keyword, startPage);

    allJobs.push(...jobsFrom104, ...jobsFromYourator, ...jobsFromCakeresume);
  }

  return allJobs;
};

*寫入 Firestore

// Excerpt of the write step: store the job in the "jobList" collection under
// the document ID job.key, adding a server-generated timestamp field.
// NOTE(review): set() is asynchronous — the caller should await its result.
db.collection("jobList")
  .doc(job.key)
  .set({
    ...job,
    timestamp: FieldValue.serverTimestamp(),
  });

collection:jobList
docID : 職缺 URL (已移除非文字部分)
field:name、companyName、salary...等,再加上 timestamp (代表此筆資料最新抓取的時間)


建立 Cloud Scheduler 於每日 8:00 觸發 Cloud Functions 執行,可於 Firestore 介面看到寫入的資料

https://ithelp.ithome.com.tw/upload/images/20231012/20128122HCfQcIiA2B.png


上一篇
[開發] 資料彙整 - 評估
下一篇
[開發] 借站巨人的肩膀 -Chakra UI
系列文
職缺資訊平台—Jobscanner31
圖片
  直播研討會
圖片
{{ item.channelVendor }} {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言